* Collect graphs as tabs of a single Graph window
set autotabgraphs on

* Load the CME catalog. Speed is the second-order initial speed when available,
* otherwise the linear speed; events with neither are dropped.
use "HALO CME data.dta", clear
qui gen double speed = cond(ndorderspeedinitial==., linearspeed, ndorderspeedinitial)
label var speed "Speed (km/sec)"
drop if speed==.
qui gen double lspeed = ln(speed)

* Event timestamp plus calendar week and year derived from the string date
gen double t = clock(date+ " " + time, "MDYhms") // use non-UTC time so obs are perfectly spaced at 3.6 million milliseconds
format t %tc
qui gen week=wofd(date(date, "MDY"))
qui gen year=yofd(date(date, "MDY"))
format week %tw

* keep if week<=tw(2010w13) // approximate Riley sample

* Span of the sample (t is in milliseconds) and number of events
sum t
scalar T = (r(max)-r(min))/1000/60/60/24/365.25/10 // decades of data
scalar N = _N
 
* Empirical complementary CDF: rank events by speed; the slowest event gets 1.
* n is stored as long because int silently becomes missing above 32,740,
* which would break the later lookups that use n as an observation index.
sort speed
gen long n = _n
qui gen double compcdf = 1-(_n-1)/_N
label var compcdf "Probability of CME at least this fast"

histogram speed, frac bin(50) xsize(6.5) ysize(4) lcolor(black) lwidth(thick)

* NOTE(review): -extreme- is a user-written command; mrl() presumably requests a
* mean-residual-life plot over thresholds 0-4000 -- confirm against its help file.
extreme plot speed, mrl(0 4000)

// Append synthetic observations on a log-spaced speed grid running from 2500 to
// 5000 km/sec (100 equal steps in ln(speed)) so the fitted curves can be
// extrapolated to Carrington-class speeds. The small .00001 added to the upper
// bound keeps the final grid point from being lost to rounding.
qui forvalues lnspeed=`=ln(2500)'(`=ln(5000/2500)/100')`=ln(5000)+.00001' {
	local newobs = _N + 1
	set obs `newobs'
	replace speed = exp(`lnspeed') in `newobs'
}

// Synthetic rows were appended after the N real ones, so flag them by position
// before re-sorting, then fill in their log speed.
gen byte fake = _n>N
qui replace lspeed = ln(speed) if fake
sort speed

* Fit a generalized Pareto distribution (GPD) to real events above the threshold
scalar threshold = 1500
extreme gpd speed if !fake, thresh(`=threshold')

* Empirical probability of reaching the threshold = largest complementary-CDF
* value among real tail events. Taken directly from compcdf rather than by
* indexing compcdf[] with the original rank n, because n need not equal the
* current observation number once the synthetic rows have been sorted in.
sum compcdf if speed>=threshold & !fake
scalar extremep = r(max)
predictnl double gpdfit = (1+[xi]_cons*(speed-`=threshold')/exp([lnsig]_cons))^(-1/[xi]_cons) * extremep if speed>=threshold, force // cumulative CDF of GPD, scaled to total CDF of tail
label var gpdfit "Generalized Pareto Distribution fit"

* Goodness-of-fit diagnostics on the estimation sample only
preserve
keep if e(sample)
* data are sorted by speed, so the first kept row carries the tail's largest compcdf
scalar extremep = compcdf[1]
gen y = _n/(_N+1)
gen x = 1- gpdfit / extremep
scatter y x || function y=x,  name(ProbabilityPlot, replace)  // probability plot, test of goodness of fit (Coles, ch. 2)--should be close to a straight line
* invert the fitted GPD CDF at the plotting positions y
replace x = ((1-y)^(-[xi]_cons) - 1) * exp([lnsig]_cons) / [xi]_cons + threshold
scatter speed x || function y=x, range(`=threshold' 3500) name(QuantilePlot, replace) xlabel(`=threshold'(500)3500) ylabel(`=threshold'(500)3500) // quantile plot, test of goodness of fit (Coles, ch. 2)--should be close to a straight line
* GPD density is (1/sigma)*(1+xi*(x-u)/sigma)^(-1/xi-1); the exponent needs the
* extra -1, otherwise the overlay is the survival function divided by sigma.
* The curve is drawn from the threshold so it spans the same support as the histogram.
twoway histogram speed, bin(10) || function y=(1+[xi]_cons*(x-`=threshold')/exp([lnsig]_cons))^(-1/[xi]_cons-1)/exp([lnsig]_cons), range(`=threshold' 3500) ///
	name(GPDdist, replace) legend(off) // empirical vs. fitted distribution
restore

* Bootstrap the GPD fit: each replication stores the shape (xi), log scale
* (lnsig), the convergence flag and the number of tail events in GPDfit.dta.
bootstrap xi=[xi]_cons lnsig=[lnsig]_cons converged=e(converged) extremeN=e(N), ///
	seed(987654321) reps(1000) saving(GPDfit, replace): ///
	extreme gpd speed if !fake, thresh(`=threshold') iter(100)

gen double gpdfit_lo= .
gen double gpdfit_hi= .

* Walk every row from the first tail event to the end of the data (synthetic
* rows included). For each speed, evaluate the exceedance probability under
* every converged bootstrap draw and keep the 2.5/50/97.5 percentiles.
sum n if speed>=threshold, meanonly
local firsttail = r(min)
qui forvalues obs=`firsttail'/`=_N' {
	scalar thisspeed = speed[`obs']
	preserve
	use GPDfit, clear
	keep if converged
	* GPD prob of event at least this big, scaled by each draw's share of events
	* above the threshold (extremeN/N); zero beyond the fitted upper endpoint
	gen double gpdbase = 1+xi/exp(lnsig)*(thisspeed-threshold)
	gen double p = cond(gpdbase>0, gpdbase^(-1/xi) * extremeN/N, 0)
	centile p, centile(2.5 50 97.5)
	restore
	* r() from -centile- is still available here; the median replaces the point fit
	replace gpdfit_lo = r(c_1) in `obs'
	replace gpdfit_hi = r(c_3) in `obs'
	replace gpdfit = r(c_2) in `obs'
}

* Riley-style power-law fit to the real events above its own threshold.
* NOTE(review): -powerlaw- is a user-written command; it is handed the scalar's
* name here -- confirm it resolves scalars in threshold().
scalar RileyThreshold = 2000
powerlaw speed if !fake, threshold(RileyThreshold)

* Predict the log exceedance probability with a 95% interval, then exponentiate
* the point estimate and both bounds back to the probability scale.
predictnl double lrileyfit = ln(exp(-([alpha]_cons - 1) * (ln(`e(depvar)') - ln(`e(threshold)'))) * e(N)/N), force ci(lriley_lo lriley_hi) level(95) // Scale log-log fit line to CDF of interval
qui gen double rileyfit=exp(lrileyfit)
foreach bound in lo hi {
	gen double riley_`bound'=exp(lriley_`bound')
}
label var rileyfit "Riley-style power law fit"

* Report the results at the last row (the fastest speed on the grid, ~5000 km/sec).
* Per-decade figures turn a per-event probability p into 1-(1-p)^(N/T), where
* N/T is the number of events per decade.
display "per event:  95% GPD CI for 5000: [" %12.10f gpdfit_lo[_N] "," %12.10f gpdfit_hi[_N] ///
	"] per event. Median = " %12.10f gpdfit[_N]
display "per decade: 95% GPD CI for 5000: [" 1-(1-gpdfit_lo[_N])^(N/T) "," 1-(1-gpdfit_hi[_N])^(N/T) ///
	"] per decade. Median = " 1-(1-gpdfit[_N])^(N/T)
display "per event:  95% Riley-style CI for 5000: [" %12.10f riley_lo[_N] "," %12.10f riley_hi[_N] ///
	"] per event. Median = " %12.10f rileyfit[_N]
display "per decade: 95% Riley-style CI for 5000: [" 1-(1-riley_lo[_N])^(N/T) "," 1-(1-riley_hi[_N])^(N/T) ///
	"] per decade. Median = " 1-(1-rileyfit[_N])^(N/T)

set scheme s1color

* compare to Riley (2012), Fig 5:
* log-log complementary CDF of the data with both fitted curves overlaid
twoway ///
	(scatter compcdf speed if !fake, msize(medium) mcolor(black)) ///
	(line rileyfit speed if rileyfit<1, lwidth(medthick) lcolor(purple)) ///
	(line gpdfit speed if !fake, lwidth(medthick) lcolor(orange)) ///
	if speed>=100, ///
	xscale(log) yscale(log) ///
	xlabel(100 200 500 700 1000 2000 5000) ///
	xtick(100(100)1000 1000(1000)5000, grid) ///
	ylabel(.00001 .0001 .001 .01 .1 1) ///
	ytick(.00001(.00001).0001 .0001(.0001).001 .001(.001).01 .01(.01).1 .1(.1)1) ///
	name(CompCDFPerEvent, replace) xline(5000) xsize(6.5) ysize(4) ///
	ytitle("Probability of a CME at least this fast") ///
	legend(label(1 Data) rows(1))

* Tail close-up with confidence spikes on the synthetic grid; the Riley series
* is drawn against a slightly shifted x (_speed) so its spikes do not overprint
* the GPD ones.
gen _speed = 1.002 *speed
twoway ///
	(rspike riley_lo riley_hi _speed if fake, lcolor(maroon)) ///
	(rspike gpdfit_lo gpdfit_hi speed if fake, lcolor(orange)) ///
	(line rileyfit _speed, lcolor(purple)) ///
	(line gpdfit speed, lcolor(orange)) ///
	(scatter compcdf speed, msize(small) mcolor(black)) ///
	if speed>=2500, ///
	xscale(log) xlabel(2500(200)5000) ///
	name(CompCDFPerEventCILinear, replace) xsize(6.5) ysize(4) ///
	legend(label(5 Data) rows(1) order(5 3 4)) ///
	xline(5000) ytitle("Probability of a CME at least this fast")

* GPD-only tail view: bootstrap band, data through 2010w52 plus the last row, and the fit
twoway ///
	(rarea gpdfit_lo gpdfit_hi speed, fintensity(inten30) astyle(ci)) ///
	(scatter compcdf speed, msize(small) msymbol(Oh)) ///
	(line gpdfit speed) ///
	if speed>=3000 & (week<=tw(2010w52) | _n==_N), ///
	xscale(log) xlabel(3000(200)5000) ///
	name(CompCDFPerEventCILinearNoRiley, replace) xsize(6.5) ysize(4)


-----
**** Max-spectrum analysis like in Ruzmaikin, Feynman, and Stoev

* Rebuild an hourly series of maximum CME speed for 1999-2006
use "HALO CME data.dta", clear
qui gen year=yofd(date(date, "MDY"))
qui gen double speed = cond(ndorderspeedinitial==., linearspeed, ndorderspeedinitial)
keep if year>=1999 & year<=2006 & speed!=.
* Hour index = milliseconds since the Stata epoch / 3.6 million. clock() (no
* leap seconds) is used rather than Clock() so that bin edges fall exactly on
* the hour; Clock() would shift every edge by the accumulated leap seconds.
gen long hour = floor(clock(date+" "+time, "MDYhms") / (1000*3600))
collapse (max) speed, by(hour)
* Fill in hours with no event and give them speed 0
tsset hour
tsfill
recode speed . = 0
* Random key, only used if the optional shuffle below is enabled
gen t = uniform()
* sort t // uncomment to randomly reorder
mata
// View onto the hourly maximum-speed series
speed = .; st_view(speed, ., "speed")
// Number of dyadic scales: the largest J with 2^J <= number of hours
J = floor(ln(rows(speed))/ln(2))
maxspectrum = J(J, 1, .)
// Pad with missings up to 2^(J+1) rows so colshape() conforms at every scale
X = J(2^(J+1), 1, .)
X[|1 \ rows(speed)|] = speed
// Scale j: cut the series into blocks of 2^j hours, take each block's maximum,
// and average the base-2 logs of those maxima.
// NOTE(review): blocks whose maximum is 0 or missing give a missing log and are
// presumably skipped by mean() -- confirm that is the intended treatment.
for (j=1; j<=J; j++) {
	maxspectrum[j] = mean(log(rowmax(colshape(X, 2^j)))) / log(2)
}
// Scale index 1..J. The loop leaves j equal to J+1, so the index must be built
// from J; using j would create one more index than there are spectrum values.
j = 1::J
end
getmata maxspectrum j, force replace
* maxspectrum is a base-2 log, so 2^maxspectrum returns it to the speed scale
gen expmaxspectrum = 2^maxspectrum
scatter expmaxspectrum j, ylabel(454 665 991 1406 1889 2673) yscale(log) ///
	xlabel(2 "4h" 4 "16h" `=ln(3*24)/ln(2)' "3d" `=ln(11*24)/ln(2)' "11d" `=ln(43*24)/ln(2)' "43d" `=ln(171*24)/ln(2)' "171d" `=ln(1.9*365.25*24)/ln(2)' "1.9y" `=ln(7.5*365.25*24)/ln(2)' "7.5y")
* Fit the slope only on scales whose typical block maximum exceeds 700 km/sec.
* The cutoff is applied with base 2 to match maxspectrum's units; exp() would
* make the condition true on every scale and the restriction would do nothing.
regress maxspectrum j if 2^maxspectrum>700
* the max-spectrum slope estimates 1/alpha
di "Estimated alpha = " 1/_b[j]

--------
// max spectrum of lognormal distribution
// one-time

mata
// Simulated max spectrum of a standard lognormal sample: 2^J values placed on
// an even probability grid, reshuffled reps times.
J = 20
reps = 100
maxspectrum = J(0, J, 0)
// X = jumble(((0::2^J-1)/2^J):^(-1/(3-1))) power law
// NOTE(review): the grid starts at p=0, where invnormal() presumably returns
// missing; that single element is then ignored by rowmax() -- confirm.
X = exp(invnormal((0::2^J-1)/2^J))
rseed(987654321)
for (i=reps; i; i--) {
	X = jumble(X)
	maxspectrum_i = J(1, J, .)
	// pad to 2^(J+1) rows so colshape() conforms at every scale
	X2 = J(2^(J+1), 1, .)
	X2[|1 \ rows(X)|] = X
	// scale j: mean natural log of the maxima over blocks of 2^j values
	for (j=1; j<=J; j++) {
		maxspectrum_i[j] = mean(log(rowmax(colshape(X2, 2^j))))
	}
	maxspectrum = maxspectrum \ maxspectrum_i
}
// Convert every replication to base-2 logs BEFORE taking moments, so that the
// mean and the deviations around it share units. Subtracting a base-2 mean
// from natural-log values would inflate the standard error.
maxspectrum = maxspectrum / ln(2)
mu = mean(maxspectrum)
// standard error of the mean across replications, per scale
se = maxspectrum :- mu; se = sqrt(mean(se:*se)/(reps-1))
mu=mu';se=se';j=1::J
"j, mu, se"
j, mu, se
end
getmata maxspectrum=mu se j, force replace
* Drop leftovers one at a time: a single -drop- with several names removes
* nothing if any one of them is absent, and the -gen- below would then fail
* whenever expmaxspectrum already exists but lo/hi do not.
foreach v in expmaxspectrum lo hi {
	cap drop `v'
}
* Back-transform the base-2 spectrum and its +/-1.96 se band to the original scale
gen expmaxspectrum = 2^maxspectrum
gen lo = 2^(maxspectrum-1.96*se)
gen hi = 2^(maxspectrum+1.96*se)
scatter expmaxspectrum j, yscale(log)
twoway rarea lo hi j || line expmaxspectrum j, yscale(log)


